* Project      :  The Full, Persistent, and Symmetric Pass-Through of a Temporary VAT Cut
* Authors      :  Márcia Silva-Pereira, João Quelhas, Tiago Bernardino, Ricardo Duque Gabriel
* Date         :  01/04/2025
* Description  :  Dataset cleaning
* Dependencies :
* Modifications: (add date, author and change)

*********************************************************
*                   Dataset Cleaning                    *
*********************************************************

// Load the cleaned dataset.
use "${path_work}/auxiliar/combined_dataset.dta", clear

*********************************************************

// (1) Construction of relevant variables.

// Compute a price index with base in the day before policy announcement II.
local   base_date 23/03/2023
egen    base_price_23032023 = total(price / (date == td(`base_date'))), by(id_sm)
gen     index_23032023 = (price / base_price_23032023) * 100

// Compute a price index with base in the day before policy announcement II.
local   base_date 26/03/2023
egen    base_price_26032023 = total(price / (date == td(`base_date'))), by(id_sm)
gen     index_26032023 = (price / base_price_26032023) * 100

// Compute a price index with base in the day before policy implementation.
local   base_date 17/04/2023
egen    base_price_17042023 = total(price / (date == td(`base_date'))), by(id_sm)
gen     index_17042023 = (price / base_price_17042023) * 100

// Compute a price index with base in the day before the end of the policy.
local   base_date 04/01/2024
egen    base_price_04012024 = total(price / (date == td(`base_date'))), by(id_sm)
gen     index_04012024 = (price / base_price_04012024) * 100

drop base_price_23032023 base_price_26032023 base_price_17042023 base_price_04012024 regular_price ppu

// Rename variable ecoicop and identify the ECOICOP 4-digit classification.
rename ecoicop ecoicop5

gen ecoicop4 = .
replace ecoicop4 = 0111 if ecoicop5 == 01111 | ecoicop5 == 01113 | ecoicop5 == 01116
replace ecoicop4 = 0112 if ecoicop5 == 01121 | ecoicop5 == 01122 | ecoicop5 == 01124
replace ecoicop4 = 0113 if ecoicop5 == 01131 | ecoicop5 == 01132 
replace ecoicop4 = 0114 if ecoicop5 == 01141 | ecoicop5 == 01142 | ecoicop5 == 01144 | ///
			   ecoicop5 == 01145 | ecoicop5 == 01146 | ecoicop5 == 01147
replace ecoicop4 = 0115 if ecoicop5 == 01151 | ecoicop5 == 01153
replace ecoicop4 = 0116 if ecoicop5 == 01161
replace ecoicop4 = 0117 if ecoicop5 == 01171 | ecoicop5 == 01172 | ecoicop5 == 01174

// Identify products imported (0) and produced in Portugal (1).
gen domestic     = 1 if real(substr(string(ean13, "%14.0f"), 1, 3)) == 560
replace domestic = 0 if (real(substr(string(ean13, "%14.0f"), 1, 3)) != 560 & ean13 != .)

drop ean13

// Identify products from trademark-labels (0) and white-labels (1).
gen white_label = 0
foreach expr in "continente" "auchan" "pingo" "aldi" {
	replace white_label = 1 if strpos(brand, "`expr'") > 0
	}

drop brand

// Identify products from small supermarkets (0) and big supermarkets (1).
gen big = 1 if supermarket == "CONTINENTE" | supermarket == "PINGODOCE"
replace big = 0 if big == .

save "${path_work}/auxiliar/tempfile.dta", replace

// Identify products with price below average (0) and above average (1).
keep if date >= td(01/01/2023) & date < td(01/03/2023)
egen average_price = mean(price), by(id_sm)
keep id_sm average_price
duplicates drop id_sm, force
save "${path_work}/auxiliar/average_price.dta", replace

use "${path_work}/auxiliar/tempfile.dta"

merge m:1 id_sm using "${path_work}/auxiliar/average_price.dta"
drop _merge
sum average_price if treatment == 1, d
return list
gen high_price = 1 if average_price >= `r(mean)' 
replace high_price = 0 if average_price < `r(mean)'

erase "${path_work}/auxiliar/tempfile.dta"

*********************************************************

// (2) Flag products that were not certain to be treated (c.f. Portal das Finanças and Excel VAT_Doubts)

// Generate a new variable 'vat_doubt' and initialize it to missing values.
gen vat_doubt = .

// Loop through each product in the 'doubt' global list.
foreach expr in nabo couve carapau sangue meloa melancia sumo nectar vitelo vitela leitao vitelao novilha novilho boi leitao bacalhau barrigas linguas cabecas caras abas galinha carcaca carcaça espetada espetadas hamburguer tostas panado panada nugget requeijao {
    // Set 'vat_doubt' to 1 for rows where the product name contains the current product in the loop.
    replace vat_doubt = 1 if strpos(name, "`expr'") > 0
}

// Loop through specific products to be flagged for further investigation.
foreach expr in "massa instantanea" "tomate seco" "tomate desidratado" "arroz vaporizado" "batata doce" "leite infantil" "leite crianca" "batata congelada" "cebola chalota" "ervilha torta" "manteiga alho" "manteiga ervas" "manteiga amendoim" "conserva atum tomate" "conserva atum escabeche" "conserva atum tomate" "conserva atum algarvia" "conserva atum picante" "conserva atum caldeirada" "conserva peixe" "mini tomate cherry" "arroz basmati" "arroz jasmim" "rolo de carne" {
    // Set 'vat_doubt' to 1 for rows where the product name contains the current product in the loop.
    replace vat_doubt = 1 if strpos(name, "`expr'") > 0
}

*********************************************************

// (3) Correct measurement errors.

// Change treatment == 1 as these are measurement errors: 
// products classified as control in a COICOP 5 that has only treated products.
replace treatment = 1 if ecoicop5 == 1111 | ecoicop5 == 1113 | ecoicop5 == 1121 | ///
			 ecoicop5 == 1122 | ecoicop5 == 1135 | ecoicop5 == 1141 | ///
			 ecoicop5 == 1142 | ecoicop5 == 1144 | ecoicop5 == 1145 | ///
			 ecoicop5 == 1147 | ecoicop5 == 1151 | ecoicop5 == 1153 | ///
			 ecoicop5 == 1154

replace treatment = 0 if ecoicop5 == 01112 | ecoicop5 == 01115 | ecoicop5 == 01117 | ///
			 ecoicop5 == 01123 | ecoicop5 == 01125 | ecoicop5 == 01126 | ///
			 ecoicop5 == 01127 | ecoicop5 == 01133 | ecoicop5 == 01134 | ///
			 ecoicop5 == 01143 | ecoicop5 == 01152 | ecoicop5 == 01155 | ///
			 ecoicop5 == 01162 | ecoicop5 == 01163 | ecoicop5 == 01164 | ///
			 ecoicop5 == 01175 | ecoicop5 == 01118 | ecoicop5 == 01182 | ///
			 ecoicop5 == 01184 | ecoicop5 == 01185 | ecoicop5 == 01186 | ///
			 ecoicop5 == 01191 | ecoicop5 == 01192 | ecoicop5 == 01193 | ///
			 ecoicop5 == 01194 | ecoicop5 == 01195 | ecoicop5 == 01211 | ///
			 ecoicop5 == 01212 | ecoicop5 == 01213 | ecoicop5 == 01221 | ///
			 ecoicop5 == 01222 | ecoicop5 == 01223 | ecoicop5 == 02111 | ///
			 ecoicop5 == 02112 | ecoicop5 == 02121 | ecoicop5 == 02122 | ///
			 ecoicop5 == 02123 | ecoicop5 == 02124 | ecoicop5 == 02131 | ///
			 ecoicop5 == 02132 | ecoicop5 == 02133 | ecoicop5 == 02134 | ///
			 ecoicop5 == 01181 | ecoicop5 == 01183
		 
*********************************************************

// (4) Add usable time variable to the dataset and set it as a panel.

// Merge datasets based on the 'date' variable.
merge m:1 date using "${path_work}/source/weeks.dta"
drop _merge

gen year = year(date)

// Generate a new variable 'week_year' representing the combination of year and week.
gen week_year = yw(year(date), week_number)

// Drop the original 'week' variable.
drop week_number

// Format 'week_year' as a Stata weekly date.
format week_year %tw

// Create a new variable 'month_year' representing the combination of year and month.
gen month_year = ym(year(date), month(date))

// Format 'month_year' as a Stata monthly date.
format month_year %tm

// Create a new variable 'day' with the day of the observation.
generate day = day(date)

*********************************************************

// (5) Add weights variable.

// Merge datasets based on 'ecoicop' and 'treatment'.
merge m:1 ecoicop5 treatment using "${path_work}/source/ecoicop_categories.dta"
bys ecoicop5 treatment date: gen frequency = _N
gen weight = weight_2023 / frequency
replace weight = 0 if _merge == 1
drop _merge

// Set the data as a panel with daily observations.
xtset id_sm date, daily

// Save the updated dataset to a file named "cleaned_dataset.dta".
save "${path_work}/auxiliar/cleaned_dataset.dta", replace

erase "${path_work}/auxiliar/average_price.dta"

*********************************************************


* Create datasets used in baseline results and appendix.
*********************************************************

use "${path_work}/auxiliar/cleaned_dataset.dta", clear

** (1) DATASET WITH FOOD AND DRINK PRODUCTS

// Exclude observations before January 1, 2023.
keep if date >= td(01/01/2023)

// Exclude treated products that raised doubts.
drop if vat_doubt == 1

// Exclude treated products with a decrease of 23%.
drop if (treatment == 1 & ecoicop5 == 01154)

// Exclude outlier observations.
drop if supermarket == "PINGODOCE" & (day == 1 | day == 2 | day == 31 | day == 30 | day == 28 | day == 29 | day == 27)

// Save the updated dataset.
save "${path_work}/auxiliar/imputation_0_all_products.dta", replace

*********************************************************

** (2) DATASET WITH ONLY FOOD PRODUCTS

use "${path_work}/auxiliar/cleaned_dataset.dta", clear

file open myfile using "${results_tables}/table_b2.txt", write text replace

file write myfile "Table B.2: Data Cleaning Steps for the Baseline Specification" _n _n

qui levelsof id_sm, local(unique_values)
scalar num_unique = r(r)
scalar num_obs = _N
generate unique = num_unique
generate obs = num_obs
format %12.0g unique obs
file write myfile "Step 0: Web scraped products" _n "Items: " (obs) _n "Observations: " (unique) _n _n
drop unique
drop obs

// Exclude observations before January 1, 2023.
keep if date >= td(01/01/2023)

qui levelsof id_sm, local(unique_values)
scalar num_unique = r(r)
scalar num_obs = _N
generate unique = num_unique
generate obs = num_obs
format %12.0g unique obs
file write myfile "Step 1: Exclude observations before January 1, 2023" _n "Items: " (obs) _n "Observations: " (unique) _n _n
drop unique
drop obs

// Exclude treated products that raised doubts.
drop if vat_doubt == 1

qui levelsof id_sm, local(unique_values)
scalar num_unique = r(r)
scalar num_obs = _N
generate unique = num_unique
generate obs = num_obs
format %12.0g unique obs
file write myfile "Step 2: Exclude treated products that raised doubts" _n "Items: " (obs) _n "Observations: " (unique) _n _n
drop unique
drop obs

// Exclude treated products with a decrease of 23%.
drop if (treatment == 1 & ecoicop5 == 01154)

qui levelsof id_sm, local(unique_values)
scalar num_unique = r(r)
scalar num_obs = _N
generate unique = num_unique
generate obs = num_obs
format %12.0g unique obs
file write myfile "Step 3: Exclude treated products with a decrease of 23%" _n "Items: " (obs) _n "Observations: " (unique) _n _n 
drop unique
drop obs

// Exclude outlier observations.
drop if supermarket == "PINGODOCE" & (day == 1 | day == 2 | day == 31 | day == 30 | day == 28 | day == 29 | day == 27)

qui levelsof id_sm, local(unique_values)
scalar num_unique = r(r)
scalar num_obs = _N
generate unique = num_unique
generate obs = num_obs
format %12.0g unique obs
file write myfile "Step 4: Exclude outlier observations" _n "Items: " (obs) _n "Observations: " (unique) _n _n
drop unique
drop obs

* Exclude all drinks.
drop if ecoicop5 == 01211 | ecoicop5 == 01212 | ecoicop5 == 01213 | ///
	ecoicop5 == 01221 | ecoicop5 == 01222 | ecoicop5 == 01223 | ///
	ecoicop5 == 02111 | ecoicop5 == 02112 | ecoicop5 == 02121 | ///
	ecoicop5 == 02122 | ecoicop5 == 02123 | ecoicop5 == 02124 | ///
	ecoicop5 == 02131 | ecoicop5 == 02132 | ecoicop5 == 02133 | ///
	ecoicop5 == 02134

qui levelsof id_sm, local(unique_values)
scalar num_unique = r(r)
scalar num_obs = _N
generate unique = num_unique
generate obs = num_obs
format %12.0g unique obs
file write myfile "Step 5: Exclude all drinks" _n "Items: " (obs) _n "Observations: " (unique)
drop unique
drop obs

// Save the updated dataset.
save "${path_work}/auxiliar/imputation_0_only_food.dta", replace

file close myfile

*********************************************************

** (3) DATASET WITH ONLY 6% VAT PRODUCTS

use "${path_work}/auxiliar/cleaned_dataset.dta", clear

// Exclude observations before January 1, 2023.
keep if date >= td(01/01/2023)

// Exclude treated products that raised doubts.
drop if vat_doubt == 1

// Exclude treated products with a decrease of 23%.
drop if (treatment == 1 & ecoicop5 == 01154)

// Exclude outlier observations.
drop if supermarket == "PINGODOCE" & (day == 1 | day == 2 | day == 31 | day == 30 | day == 28 | day == 29 | day == 27)

* Keep only coicop 5 with 6% VAT (reduced tax rate).
keep if ecoicop5 == 01121 | ecoicop5 == 01122 | ecoicop5 == 01123 | ///
	ecoicop5 == 01124 | ecoicop5 == 01125 | ecoicop5 == 01126 | ///
	ecoicop5 == 01131 | ecoicop5 == 01133 | ecoicop5 == 01147 | ///
	ecoicop5 == 01161 | ecoicop5 == 01171 | ecoicop5 == 01111 | ///
	ecoicop5 == 01112 | ecoicop5 == 01113 | ecoicop5 == 01114 | ///
	ecoicop5 == 01115 | ecoicop5 == 01116 | ecoicop5 == 01117 | ///
	ecoicop5 == 01118 | ecoicop5 == 01127 | ecoicop5 == 01128 | ///
	ecoicop5 == 01132 | ecoicop5 == 01134 | ecoicop5 == 01135 | ///
	ecoicop5 == 01136 | ecoicop5 == 01141 | ecoicop5 == 01142 | ///
	ecoicop5 == 01143 | ecoicop5 == 01144 | ecoicop5 == 01145 | ///
	ecoicop5 == 01146 | ecoicop5 == 01151 | ecoicop5 == 01152 | ///
	ecoicop5 == 01153 | ecoicop5 == 01154 | ecoicop5 == 01155 | ///
	ecoicop5 == 01162 | ecoicop5 == 01163 | ecoicop5 == 01164 | ///
	ecoicop5 == 01172 | ecoicop5 == 01173 | ecoicop5 == 01174 | ///
	ecoicop5 == 01175 | ecoicop5 == 01176 | ecoicop5 == 01181 | ///
	ecoicop5 == 01182 | ecoicop5 == 01183 | ecoicop5 == 01184 | ///
	ecoicop5 == 01185 | ecoicop5 == 01186 | ecoicop5 == 01191 | ///
	ecoicop5 == 01192 | ecoicop5 == 01193 | ecoicop5 == 01194 | ///
	ecoicop5 == 01199 | ecoicop5 == 01211 | ecoicop5 == 01212 | ///
	ecoicop5 == 01213 | ecoicop5 == 01221 | ecoicop5 == 01222 | ///
	ecoicop5 == 01223

// Save the updated dataset.
save "${path_work}/auxiliar/imputation_0_reduced_tax_coicop.dta", replace

*********************************************************

** (4) DATASET FOR OILS ANALYSIS

use "${path_work}/auxiliar/cleaned_dataset.dta", clear

// Exclude observations before January 1, 2023.
keep if date >= td(01/01/2023)

// Exclude treated products that raised doubts.
drop if vat_doubt == 1

// Exclude products that had a decrease in VAT from 23% to 0%.
keep if (treatment == 1 & ecoicop5 == 01154) | treatment == 0

// Exclude outlier observations.
drop if supermarket == "PINGODOCE" & (day == 1 | day == 2 | day == 31 | day == 30 | day == 28 | day == 29 | day == 27)

// Save the updated dataset.
save "${path_work}/auxiliar/imputation_0_oils.dta", replace

*********************************************************

erase "${path_work}/auxiliar/cleaned_dataset.dta"

*********************************************************
